Contents of Input Databases:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, accuracy_score
Loading Data:
# --- Load the four input data sets ---
dim_dates_df = pd.read_csv('dimDates_Task.csv')
posting_volume_df = pd.read_parquet('Posting_Volumes', engine='fastparquet')
clients_orders_df = pd.read_excel('X_ClientORDERS.xlsx')
# Each weather station's measurements live in a separate CSV inside the
# temperature directory; read them all and stack into a single frame.
temperature_files = glob.glob(os.path.join('Zadanie_Dane_Temperatura', '*.csv'))
temperature_df = pd.concat(
    [pd.read_csv(file) for file in temperature_files],
    ignore_index=True,  # source files each restart their index at 0
)
Helper Functions for Data Analysis:
def info(data):
    """Print a header followed by the DataFrame's dtype/non-null summary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarize.
    """
    print("Info about the data frame:")
    # DataFrame.info() prints its report directly and returns None, so the
    # original display(data.info()) rendered a stray "None" after the summary.
    data.info()
def head_and_tail(data):
    """Display the first and last four rows of *data* without truncation."""
    # Lift all pandas display limits so wide frames render in full.
    # NOTE: this mutates global pandas options for the rest of the session.
    for option in ('display.max_columns', 'display.max_rows',
                   'display.max_colwidth', 'display.width'):
        pd.set_option(option, None)
    print("First and last rows of the data frame:")
    display(data.head(4))
    display(data.tail(4))
def shape(data):
    """Print the (rows, columns) tuple of *data*."""
    dimensions = data.shape
    print('Number of rows and columns:')
    print(dimensions)
def statistics(data):
    """Display descriptive statistics for the non-object columns of *data*."""
    # Remove pandas rendering limits so every statistic column is visible.
    # NOTE: this mutates global pandas options for the rest of the session.
    for option in ('display.max_columns', 'display.max_rows',
                   'display.max_colwidth', 'display.width'):
        pd.set_option(option, None)
    print("Descriptive statistic of the data frame:")
    display(data.describe(exclude=['object']))
def missing_values_statistic(data):
    """Display a per-column table of missing-value counts and percentages."""
    null_counts = data.isnull().sum()
    summary = pd.DataFrame({
        'Missing values count': null_counts,
        'Percentage of missing data': null_counts / len(data) * 100,
    })
    display(summary)
dim_dates_df.head()
| datePk;dateId;dateYear;dateYearISO;dateMonthYear;dateWeekYearISO;dateQuarterYear;dateMonth;dateDay;dateDaySuffix;dateWeekDay;dateWeekDayName;dateWeekDayNameShort;dateWeekDayNameFirstLetter;dateDayOfYear;dateWeekOfMonth;dateWeekOfYear;dateWeekOfYearISO;dateMonthName;dateMonthNameShort;dateMonthNameFirstLetter;dateQuarter;dateQuarterName;dateQuarterShort;dateIsWeekend;dateIsHolidayInd;dateWeekDayStartsMonday;dateWeekDayNamePL;dateMonthNamePL;dateWeekOfYearStartsMonday;firstDayWorkBefore;firstDayWorkAfter;firstDayNoHolidayNoSaturdayAfter;firstDayNoHolidayNoSundayAfter;firstDayNoHolidayNoSaturdayAfterData;firstDayNoSaturdayNoHolidayOrSundayHolidayName;firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday;firstDayNoSaturdayNoHolidayOrSundayHoliday;isThuHolyday;isFriHolyday;isSatHolyday;isWeekendParcelServiceBreakInd;courierBillingPeriodDateFk;courierBillingPeriodStartDateFk;courierBillingPeriodEndDateFk;xlsDate;dwhDateFrom;dwhDateTo;dwhCurrRec;dwhAuditCd;dwhAuditCp;dwhAuditMd;dwhAuditMp | |
|---|---|
| 0 | 20201004;2020-10-04;2020;2020;10.2020;T40-2020... |
| 1 | 20200810;2020-08-10;2020;2020;08.2020;T33-2020... |
| 2 | 20200930;2020-09-30;2020;2020;09.2020;T40-2020... |
| 3 | 20200521;2020-05-21;2020;2020;05.2020;T21-2020... |
| 4 | 20200527;2020-05-27;2020;2020;05.2020;T22-2020... |
# The raw export is semicolon-separated, so the first read (default comma
# delimiter) collapsed every record into one column — re-read with ';'.
dim_dates_df = pd.read_csv('dimDates_Task.csv', delimiter=';')
head_and_tail(dim_dates_df)
First and last rows of the data frame:
| datePk | dateId | dateYear | dateYearISO | dateMonthYear | dateWeekYearISO | dateQuarterYear | dateMonth | dateDay | dateDaySuffix | dateWeekDay | dateWeekDayName | dateWeekDayNameShort | dateWeekDayNameFirstLetter | dateDayOfYear | dateWeekOfMonth | dateWeekOfYear | dateWeekOfYearISO | dateMonthName | dateMonthNameShort | dateMonthNameFirstLetter | dateQuarter | dateQuarterName | dateQuarterShort | dateIsWeekend | dateIsHolidayInd | dateWeekDayStartsMonday | dateWeekDayNamePL | dateMonthNamePL | dateWeekOfYearStartsMonday | firstDayWorkBefore | firstDayWorkAfter | firstDayNoHolidayNoSaturdayAfter | firstDayNoHolidayNoSundayAfter | firstDayNoHolidayNoSaturdayAfterData | firstDayNoSaturdayNoHolidayOrSundayHolidayName | firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | firstDayNoSaturdayNoHolidayOrSundayHoliday | isThuHolyday | isFriHolyday | isSatHolyday | isWeekendParcelServiceBreakInd | courierBillingPeriodDateFk | courierBillingPeriodStartDateFk | courierBillingPeriodEndDateFk | xlsDate | dwhDateFrom | dwhDateTo | dwhCurrRec | dwhAuditCd | dwhAuditCp | dwhAuditMd | dwhAuditMp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20201004 | 2020-10-04 | 2020 | 2020 | 10.202 | T40-2020 | Q4-2020 | 10 | 4 | th | 1 | Sunday | SUN | S | 278 | 2 | 41 | 40 | October | OCT | O | 4 | Fourth | Qtr 4 | 1 | 0 | 7 | Niedziela | Październik | 40 | 20201002 | 20201005 | 20201005 | 20201005 | 20201005 | MON | 0 | 20201005 | 0 | 0 | 0 | NaN | 20201001 | 20201001 | 20201015 | 44108 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 1 | 20200810 | 2020-08-10 | 2020 | 2020 | 8.202 | T33-2020 | Q3-2020 | 8 | 10 | th | 2 | Monday | MON | M | 223 | 3 | 33 | 33 | August | AUG | A | 3 | Third | Qtr 3 | 0 | 0 | 1 | Poniedziałek | Sierpień | 33 | 20200807 | 20200811 | 20200811 | 20200811 | 20200811 | TUE | 0 | 20200811 | 0 | 0 | 0 | NaN | 20200801 | 20200801 | 20200815 | 44053 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 2 | 20200930 | 2020-09-30 | 2020 | 2020 | 9.202 | T40-2020 | Q3-2020 | 9 | 30 | th | 4 | Wednesday | WED | W | 274 | 5 | 40 | 40 | September | SEP | S | 3 | Third | Qtr 3 | 0 | 0 | 3 | Środa | Wrzesień | 40 | 20200929 | 20201001 | 20201001 | 20201001 | 20201001 | THU | 0 | 20201001 | 0 | 0 | 0 | NaN | 20200916 | 20200916 | 20200930 | 44104 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 3 | 20200521 | 2020-05-21 | 2020 | 2020 | 5.202 | T21-2020 | Q2-2020 | 5 | 21 | st | 5 | Thursday | THU | T | 142 | 4 | 21 | 21 | May | MAY | M | 2 | Second | Qtr 2 | 0 | 0 | 4 | Czwartek | Maj | 21 | 20200520 | 20200522 | 20200522 | 20200522 | 20200522 | FRI | 0 | 20200522 | 0 | 0 | 0 | NaN | 20200516 | 20200516 | 20200531 | 43972 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| datePk | dateId | dateYear | dateYearISO | dateMonthYear | dateWeekYearISO | dateQuarterYear | dateMonth | dateDay | dateDaySuffix | dateWeekDay | dateWeekDayName | dateWeekDayNameShort | dateWeekDayNameFirstLetter | dateDayOfYear | dateWeekOfMonth | dateWeekOfYear | dateWeekOfYearISO | dateMonthName | dateMonthNameShort | dateMonthNameFirstLetter | dateQuarter | dateQuarterName | dateQuarterShort | dateIsWeekend | dateIsHolidayInd | dateWeekDayStartsMonday | dateWeekDayNamePL | dateMonthNamePL | dateWeekOfYearStartsMonday | firstDayWorkBefore | firstDayWorkAfter | firstDayNoHolidayNoSaturdayAfter | firstDayNoHolidayNoSundayAfter | firstDayNoHolidayNoSaturdayAfterData | firstDayNoSaturdayNoHolidayOrSundayHolidayName | firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | firstDayNoSaturdayNoHolidayOrSundayHoliday | isThuHolyday | isFriHolyday | isSatHolyday | isWeekendParcelServiceBreakInd | courierBillingPeriodDateFk | courierBillingPeriodStartDateFk | courierBillingPeriodEndDateFk | xlsDate | dwhDateFrom | dwhDateTo | dwhCurrRec | dwhAuditCd | dwhAuditCp | dwhAuditMd | dwhAuditMp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1457 | 20200417 | 2020-04-17 | 2020 | 2020 | 4.2020 | T16-2020 | Q2-2020 | 4 | 17 | th | 6 | Friday | FRI | F | 108 | 3 | 16 | 16 | April | APR | A | 2 | Second | Qtr 2 | 0 | 0 | 5 | Piątek | Kwiecień | 16 | 20200416 | 20200420 | 20200419 | 20200418 | 20200419 | SUN | 0 | 20200419 | 0 | 0 | 0 | NaN | 20200416 | 20200416 | 20200430 | 43938 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 1458 | 20201104 | 2020-11-04 | 2020 | 2020 | 11.2020 | T45-2020 | Q4-2020 | 11 | 4 | th | 4 | Wednesday | WED | W | 309 | 1 | 45 | 45 | November | NOV | N | 4 | Fourth | Qtr 4 | 0 | 0 | 3 | Środa | Listopad | 45 | 20201103 | 20201105 | 20201105 | 20201105 | 20201105 | THU | 0 | 20201105 | 0 | 0 | 0 | NaN | 20201101 | 20201101 | 20201115 | 44139 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 1459 | 20201116 | 2020-11-16 | 2020 | 2020 | 11.2020 | T47-2020 | Q4-2020 | 11 | 16 | th | 2 | Monday | MON | M | 321 | 3 | 47 | 47 | November | NOV | N | 4 | Fourth | Qtr 4 | 0 | 0 | 1 | Poniedziałek | Listopad | 47 | 20201113 | 20201117 | 20201117 | 20201117 | 20201117 | TUE | 0 | 20201117 | 0 | 0 | 0 | NaN | 20201116 | 20201116 | 20201130 | 44151 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
| 1460 | 20210117 | 2021-01-17 | 2021 | 2021 | 1.2021 | T2-2021 | Q1-2021 | 1 | 17 | th | 1 | Sunday | SUN | S | 17 | 4 | 4 | 2 | January | JAN | J | 1 | First | Qtr 1 | 1 | 0 | 7 | Niedziela | Styczeń | 3 | 20210115 | 20210118 | 20210118 | 20210118 | 20210118 | MON | 0 | 20210118 | 0 | 0 | 0 | 0.0 | 20210116 | 20210116 | 20210131 | 44213 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 |
shape(dim_dates_df)
Number of rows and columns: (1461, 53)
info(dim_dates_df)
Info about the data frame: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1461 entries, 0 to 1460 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datePk 1461 non-null int64 1 dateId 1461 non-null object 2 dateYear 1461 non-null int64 3 dateYearISO 1461 non-null int64 4 dateMonthYear 1461 non-null float64 5 dateWeekYearISO 1461 non-null object 6 dateQuarterYear 1461 non-null object 7 dateMonth 1461 non-null int64 8 dateDay 1461 non-null int64 9 dateDaySuffix 1461 non-null object 10 dateWeekDay 1461 non-null int64 11 dateWeekDayName 1461 non-null object 12 dateWeekDayNameShort 1461 non-null object 13 dateWeekDayNameFirstLetter 1461 non-null object 14 dateDayOfYear 1461 non-null int64 15 dateWeekOfMonth 1461 non-null int64 16 dateWeekOfYear 1461 non-null int64 17 dateWeekOfYearISO 1461 non-null int64 18 dateMonthName 1461 non-null object 19 dateMonthNameShort 1461 non-null object 20 dateMonthNameFirstLetter 1461 non-null object 21 dateQuarter 1461 non-null int64 22 dateQuarterName 1461 non-null object 23 dateQuarterShort 1461 non-null object 24 dateIsWeekend 1461 non-null int64 25 dateIsHolidayInd 1461 non-null int64 26 dateWeekDayStartsMonday 1461 non-null int64 27 dateWeekDayNamePL 1461 non-null object 28 dateMonthNamePL 1461 non-null object 29 dateWeekOfYearStartsMonday 1461 non-null int64 30 firstDayWorkBefore 1461 non-null int64 31 firstDayWorkAfter 1461 non-null int64 32 firstDayNoHolidayNoSaturdayAfter 1461 non-null int64 33 firstDayNoHolidayNoSundayAfter 1461 non-null int64 34 firstDayNoHolidayNoSaturdayAfterData 1461 non-null int64 35 firstDayNoSaturdayNoHolidayOrSundayHolidayName 1461 non-null object 36 firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday 1461 non-null int64 37 firstDayNoSaturdayNoHolidayOrSundayHoliday 1461 non-null int64 38 isThuHolyday 1461 non-null int64 39 isFriHolyday 1461 non-null int64 40 isSatHolyday 1461 non-null int64 41 isWeekendParcelServiceBreakInd 1095 non-null float64 42 
courierBillingPeriodDateFk 1461 non-null int64 43 courierBillingPeriodStartDateFk 1461 non-null int64 44 courierBillingPeriodEndDateFk 1461 non-null int64 45 xlsDate 1461 non-null int64 46 dwhDateFrom 1461 non-null object 47 dwhDateTo 1461 non-null object 48 dwhCurrRec 1461 non-null int64 49 dwhAuditCd 1461 non-null object 50 dwhAuditCp 1461 non-null int64 51 dwhAuditMd 1461 non-null object 52 dwhAuditMp 1461 non-null int64 dtypes: float64(2), int64(32), object(19) memory usage: 605.1+ KB
None
statistics(dim_dates_df)
Descriptive statistic of the data frame:
| datePk | dateYear | dateYearISO | dateMonthYear | dateMonth | dateDay | dateWeekDay | dateDayOfYear | dateWeekOfMonth | dateWeekOfYear | dateWeekOfYearISO | dateQuarter | dateIsWeekend | dateIsHolidayInd | dateWeekDayStartsMonday | dateWeekOfYearStartsMonday | firstDayWorkBefore | firstDayWorkAfter | firstDayNoHolidayNoSaturdayAfter | firstDayNoHolidayNoSundayAfter | firstDayNoHolidayNoSaturdayAfterData | firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | firstDayNoSaturdayNoHolidayOrSundayHoliday | isThuHolyday | isFriHolyday | isSatHolyday | isWeekendParcelServiceBreakInd | courierBillingPeriodDateFk | courierBillingPeriodStartDateFk | courierBillingPeriodEndDateFk | xlsDate | dwhCurrRec | dwhAuditCp | dwhAuditMp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.461000e+03 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1461.000000 | 1.461000e+03 | 1.461000e+03 | 1.461000e+03 | 1.461000e+03 | 1.461000e+03 | 1461.000000 | 1.461000e+03 | 1461.000000 | 1461.000000 | 1461.000000 | 1095.000000 | 1.461000e+03 | 1.461000e+03 | 1.461000e+03 | 1461.000000 | 1461.0 | 1461.0 | 1.461000e+03 |
| mean | 2.021566e+07 | 2021.498973 | 2021.494867 | 6.725079 | 6.522930 | 15.729637 | 4.002053 | 183.125257 | 3.116359 | 27.088980 | 26.661875 | 2.508556 | 0.286105 | 0.038330 | 4.003422 | 27.195756 | 2.021559e+07 | 2.021572e+07 | 2.021570e+07 | 2.021570e+07 | 2.021570e+07 | 0.021903 | 2.021570e+07 | 0.021903 | 0.019165 | 0.016427 | 0.020091 | 2.021565e+07 | 2.021565e+07 | 2.021566e+07 | 44561.000000 | 1.0 | 1904719.0 | 1.900809e+06 |
| std | 1.119295e+04 | 1.118723 | 1.119935 | 3.449884 | 3.449884 | 8.803105 | 2.001197 | 105.474746 | 1.320082 | 15.072801 | 15.069269 | 1.117465 | 0.452094 | 0.192057 | 1.999826 | 15.071792 | 1.122535e+04 | 1.122521e+04 | 1.120355e+04 | 1.120912e+04 | 1.120934e+04 | 0.146416 | 1.120947e+04 | 0.146416 | 0.137152 | 0.127155 | 0.140377 | 1.119296e+04 | 1.119296e+04 | 1.119295e+04 | 421.898685 | 0.0 | 0.0 | 8.623693e+04 |
| min | 2.020010e+07 | 2020.000000 | 2020.000000 | 1.202000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 2.019123e+07 | 2.020010e+07 | 2.020010e+07 | 2.020010e+07 | 2.020010e+07 | 0.000000 | 2.020010e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.020010e+07 | 2.020010e+07 | 2.020012e+07 | 43831.000000 | 1.0 | 1904719.0 | 3.310000e+02 |
| 25% | 2.020123e+07 | 2020.000000 | 2020.000000 | 4.202000 | 4.000000 | 8.000000 | 2.000000 | 92.000000 | 2.000000 | 14.000000 | 14.000000 | 2.000000 | 0.000000 | 0.000000 | 2.000000 | 14.000000 | 2.020123e+07 | 2.021010e+07 | 2.021010e+07 | 2.021010e+07 | 2.021010e+07 | 0.000000 | 2.021010e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.020122e+07 | 2.020122e+07 | 2.020123e+07 | 44196.000000 | 1.0 | 1904719.0 | 1.904719e+06 |
| 50% | 2.021123e+07 | 2021.000000 | 2021.000000 | 7.202000 | 7.000000 | 16.000000 | 4.000000 | 183.000000 | 3.000000 | 27.000000 | 27.000000 | 3.000000 | 0.000000 | 0.000000 | 4.000000 | 27.000000 | 2.021123e+07 | 2.022010e+07 | 2.022010e+07 | 2.022010e+07 | 2.022010e+07 | 0.000000 | 2.022010e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.021122e+07 | 2.021122e+07 | 2.021123e+07 | 44561.000000 | 1.0 | 1904719.0 | 1.904719e+06 |
| 75% | 2.022123e+07 | 2022.000000 | 2022.000000 | 10.202000 | 10.000000 | 23.000000 | 6.000000 | 274.000000 | 4.000000 | 40.000000 | 40.000000 | 4.000000 | 1.000000 | 0.000000 | 6.000000 | 40.000000 | 2.022123e+07 | 2.023010e+07 | 2.023010e+07 | 2.023010e+07 | 2.023010e+07 | 0.000000 | 2.023010e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.022122e+07 | 2.022122e+07 | 2.022123e+07 | 44926.000000 | 1.0 | 1904719.0 | 1.904719e+06 |
| max | 2.023123e+07 | 2023.000000 | 2023.000000 | 12.202300 | 12.000000 | 31.000000 | 7.000000 | 366.000000 | 6.000000 | 53.000000 | 53.000000 | 4.000000 | 1.000000 | 1.000000 | 7.000000 | 53.000000 | 2.023123e+07 | 2.024010e+07 | 2.024010e+07 | 2.024010e+07 | 2.024010e+07 | 1.000000 | 2.024010e+07 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.023122e+07 | 2.023122e+07 | 2.023123e+07 | 45291.000000 | 1.0 | 1904719.0 | 1.904719e+06 |
# Inspect every categorical (object) column for typos or inconsistent labels.
object_columns = [
    'dateId', 'dateDaySuffix', 'dateWeekDayName', 'dateWeekDayNameShort',
    'dateWeekDayNameFirstLetter', 'dateMonthName', 'dateMonthNameShort',
    'dateMonthNameFirstLetter', 'dateQuarterName', 'dateQuarterShort',
    'dateWeekDayNamePL', 'dateMonthNamePL', 'firstDayNoSaturdayNoHolidayOrSundayHolidayName'
]
for col in object_columns:
    print(f"Unique values in column '{col}':")
    print(dim_dates_df[col].unique())
    print("\n")
Unique values in column 'dateId': ['2020-10-04' '2020-08-10' '2020-09-30' ... '2020-11-04' '2020-11-16' '2021-01-17'] Unique values in column 'dateDaySuffix': ['th' 'st' 'rd' 'nd'] Unique values in column 'dateWeekDayName': ['Sunday' 'Monday' 'Wednesday' 'Thursday' 'Saturday' 'Tuesday' 'Friday'] Unique values in column 'dateWeekDayNameShort': ['SUN' 'MON' 'WED' 'THU' 'SAT' 'TUE' 'FRI'] Unique values in column 'dateWeekDayNameFirstLetter': ['S' 'M' 'W' 'T' 'F'] Unique values in column 'dateMonthName': ['October' 'August' 'September' 'May' 'June' 'February' 'November' 'March' 'April' 'January' 'December' 'July'] Unique values in column 'dateMonthNameShort': ['OCT' 'AUG' 'SEP' 'MAY' 'JUN' 'FEB' 'NOV' 'MAR' 'APR' 'JAN' 'DEC' 'JUL'] Unique values in column 'dateMonthNameFirstLetter': ['O' 'A' 'S' 'M' 'J' 'F' 'N' 'D'] Unique values in column 'dateQuarterName': ['Fourth' 'Third' 'Second' 'First'] Unique values in column 'dateQuarterShort': ['Qtr 4' 'Qtr 3' 'Qtr 2' 'Qtr 1'] Unique values in column 'dateWeekDayNamePL': ['Niedziela' 'Poniedziałek' 'Środa' 'Czwartek' 'Sobota' 'Wtorek' 'Piątek'] Unique values in column 'dateMonthNamePL': ['Październik' 'Sierpień' 'Wrzesień' 'Maj' 'Czerwiec' 'Luty' 'Listopad' 'Marzec' 'Kwiecień' 'Styczeń' 'Grudzień' 'Lipiec'] Unique values in column 'firstDayNoSaturdayNoHolidayOrSundayHolidayName': ['MON' 'TUE' 'THU' 'FRI' 'SUN' 'WED']
The object-type columns contain no invalid values — no misspelled entries and nothing inconsistent with reality.
missing_values_statistic(dim_dates_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| datePk | 0 | 0.000000 |
| dateId | 0 | 0.000000 |
| dateYear | 0 | 0.000000 |
| dateYearISO | 0 | 0.000000 |
| dateMonthYear | 0 | 0.000000 |
| dateWeekYearISO | 0 | 0.000000 |
| dateQuarterYear | 0 | 0.000000 |
| dateMonth | 0 | 0.000000 |
| dateDay | 0 | 0.000000 |
| dateDaySuffix | 0 | 0.000000 |
| dateWeekDay | 0 | 0.000000 |
| dateWeekDayName | 0 | 0.000000 |
| dateWeekDayNameShort | 0 | 0.000000 |
| dateWeekDayNameFirstLetter | 0 | 0.000000 |
| dateDayOfYear | 0 | 0.000000 |
| dateWeekOfMonth | 0 | 0.000000 |
| dateWeekOfYear | 0 | 0.000000 |
| dateWeekOfYearISO | 0 | 0.000000 |
| dateMonthName | 0 | 0.000000 |
| dateMonthNameShort | 0 | 0.000000 |
| dateMonthNameFirstLetter | 0 | 0.000000 |
| dateQuarter | 0 | 0.000000 |
| dateQuarterName | 0 | 0.000000 |
| dateQuarterShort | 0 | 0.000000 |
| dateIsWeekend | 0 | 0.000000 |
| dateIsHolidayInd | 0 | 0.000000 |
| dateWeekDayStartsMonday | 0 | 0.000000 |
| dateWeekDayNamePL | 0 | 0.000000 |
| dateMonthNamePL | 0 | 0.000000 |
| dateWeekOfYearStartsMonday | 0 | 0.000000 |
| firstDayWorkBefore | 0 | 0.000000 |
| firstDayWorkAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSaturdayAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSundayAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSaturdayAfterData | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHolidayName | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHoliday | 0 | 0.000000 |
| isThuHolyday | 0 | 0.000000 |
| isFriHolyday | 0 | 0.000000 |
| isSatHolyday | 0 | 0.000000 |
| isWeekendParcelServiceBreakInd | 366 | 25.051335 |
| courierBillingPeriodDateFk | 0 | 0.000000 |
| courierBillingPeriodStartDateFk | 0 | 0.000000 |
| courierBillingPeriodEndDateFk | 0 | 0.000000 |
| xlsDate | 0 | 0.000000 |
| dwhDateFrom | 0 | 0.000000 |
| dwhDateTo | 0 | 0.000000 |
| dwhCurrRec | 0 | 0.000000 |
| dwhAuditCd | 0 | 0.000000 |
| dwhAuditCp | 0 | 0.000000 |
| dwhAuditMd | 0 | 0.000000 |
| dwhAuditMp | 0 | 0.000000 |
Only the isWeekendParcelServiceBreakInd column contains missing values. Missing values are 25% of all rows in this column.
head_and_tail(posting_volume_df)
First and last rows of the data frame:
| postingDateFk | Product | Customer | Volume | |
|---|---|---|---|---|
| 0 | 20210101 | APM | X | 2401.301452 |
| 1 | 20210102 | APM | X | 20195.515269 |
| 2 | 20210103 | APM | X | 63561.702640 |
| 3 | 20210104 | APM | X | 40310.645622 |
| postingDateFk | Product | Customer | Volume | |
|---|---|---|---|---|
| 3888 | 20230828 | COURIER | Rest | 234648.548766 |
| 3889 | 20230829 | COURIER | Rest | 333947.920499 |
| 3890 | 20230830 | COURIER | Rest | 324682.815539 |
| 3891 | 20230831 | COURIER | Rest | 311090.341929 |
shape(posting_volume_df)
Number of rows and columns: (3892, 4)
info(posting_volume_df)
Info about the data frame: <class 'pandas.core.frame.DataFrame'> Index: 3892 entries, 0 to 3891 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 postingDateFk 3892 non-null object 1 Product 3892 non-null object 2 Customer 3892 non-null object 3 Volume 3892 non-null float64 dtypes: float64(1), object(3) memory usage: 152.0+ KB
None
statistics(posting_volume_df)
Descriptive statistic of the data frame:
| Volume | |
|---|---|
| count | 3892.000000 |
| mean | 143536.470533 |
| std | 87923.899208 |
| min | -876.759667 |
| 25% | 80150.606161 |
| 50% | 130202.764469 |
| 75% | 193735.702340 |
| max | 582617.649973 |
# A boolean mask summed counts the True entries — rows with negative Volume.
negative_values_count = (posting_volume_df['Volume'] < 0).sum()
print(f"Number of negative values in 'Volume': {negative_values_count}")
Number of negative values in 'Volume': 4
There are 4 negative values in the Volume column, which is incorrect as daily shipments should not have a negative value.
# Verify the two categorical columns only hold the expected labels.
for col_name in ('Product', 'Customer'):
    print(f"Unique values in {col_name} column:")
    print(posting_volume_df[col_name].unique())
Unique values in Product column: ['APM' 'COURIER'] Unique values in Customer column: ['X' 'Rest']
# Boxplot for Volume values depending on product and customer
plt.figure(figsize=(12, 6))
# hue='Customer' draws a separate box per customer within each product group;
# palette maps X -> gold and Rest -> dark gray.
sns.boxplot(x='Product', y='Volume', hue='Customer', data=posting_volume_df, palette=['gold', 'darkgray'])
plt.title('Boxplot of Volume by Product and Customer')
plt.ylabel('Volume')
plt.xlabel('Product')
plt.legend(title='Customer')
plt.show()
# Reused below for the Orders histogram as well.
gold_color = "#FFD700"
# Histogram for Volume values depending on the product and customer
# One facet per (Product, Customer) pair; margin_titles moves the row labels
# to the right edge of the grid instead of repeating them in each title.
g = sns.FacetGrid(posting_volume_df, col="Product", row="Customer", margin_titles=True, height=3, aspect=1.5, )
g.map(sns.histplot, "Volume", bins=30, color=gold_color)
g.set_axis_labels("Volume", "Frequency")
g.fig.suptitle('Distribution of Volume by Product and Customer', y=1.0)
plt.show()
C:\Users\sonia\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
The differences in data between customer X and Rest are explained by the fact that X is an isolated customer, and Rest is the rest of the customers. For this reason, volumes reach larger and more variable values for Rest.
missing_values_statistic(posting_volume_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| postingDateFk | 0 | 0.0 |
| Product | 0 | 0.0 |
| Customer | 0 | 0.0 |
| Volume | 0 | 0.0 |
No missing values.
head_and_tail(clients_orders_df)
First and last rows of the data frame:
| DateId | Orders | |
|---|---|---|
| 0 | 20230101 | 161366.064671 |
| 1 | 20230102 | 296627.324653 |
| 2 | 20230103 | 378262.368584 |
| 3 | 20230104 | 320907.583398 |
| DateId | Orders | |
|---|---|---|
| 239 | 20230828 | 334471.057583 |
| 240 | 20230829 | 407334.926198 |
| 241 | 20230830 | 354357.441720 |
| 242 | 20230831 | NaN |
shape(clients_orders_df)
Number of rows and columns: (243, 2)
info(clients_orders_df)
Info about the data frame: <class 'pandas.core.frame.DataFrame'> RangeIndex: 243 entries, 0 to 242 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DateId 243 non-null int64 1 Orders 242 non-null float64 dtypes: float64(1), int64(1) memory usage: 3.9 KB
None
statistics(clients_orders_df)
Descriptive statistic of the data frame:
| DateId | Orders | |
|---|---|---|
| count | 2.430000e+02 | 242.000000 |
| mean | 2.023047e+07 | 275094.376827 |
| std | 2.303691e+02 | 70057.800956 |
| min | 2.023010e+07 | 72473.494555 |
| 25% | 2.023030e+07 | 224505.534006 |
| 50% | 2.023050e+07 | 273695.994969 |
| 75% | 2.023070e+07 | 325644.229278 |
| max | 2.023083e+07 | 416068.107760 |
DateId: Date in YYYYMMDD format (can be used as Foreign key).
Orders: Number of orders.
# Histogram for Orders
plt.figure(figsize=(10, 6))
# dropna(): the Orders column has one missing value at this point (see the
# missing-value table above); drop it before estimating the KDE curve.
sns.histplot(clients_orders_df['Orders'].dropna(), bins=30, kde=True, color=gold_color)
plt.title('Histogram of Orders')
plt.xlabel('Orders')
plt.ylabel('Frequency')
plt.show()
missing_values_statistic(clients_orders_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| DateId | 0 | 0.000000 |
| Orders | 1 | 0.411523 |
The Orders column has one missing value, which is 0.41% of the total data in the column.
head_and_tail(temperature_df)
First and last rows of the data frame:
| Nazwa stacji | Rok | Miesiac | Dzien | Maksymalna temperatura dobowa | Minimalna temperatura dobowa | Srednia temperatura dobowa | Suma dobowa opadow [mm] | Pokrywa sniegu [cm] | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | KRAKOW | 2021 | 1 | 1 | 0.5 | -1.2 | -0.5 | 0.0 | 0 |
| 1 | KRAKOW | 2021 | 1 | 2 | 4.4 | -1.6 | 0.8 | 0.0 | 0 |
| 2 | KRAKOW | 2021 | 1 | 3 | 4.0 | -0.9 | 1.1 | 0.0 | 0 |
| 3 | KRAKOW | 2021 | 1 | 4 | 4.9 | -0.7 | 1.7 | 7.1 | 0 |
| Nazwa stacji | Rok | Miesiac | Dzien | Maksymalna temperatura dobowa | Minimalna temperatura dobowa | Srednia temperatura dobowa | Suma dobowa opadow [mm] | Pokrywa sniegu [cm] | |
|---|---|---|---|---|---|---|---|---|---|
| 1942 | WARSZAWA | 2022 | 12 | 28 | 4.9 | -0.2 | 2.5 | 0.0 | 0 |
| 1943 | WARSZAWA | 2022 | 12 | 29 | 8.3 | 3.3 | 5.8 | 0.0 | 0 |
| 1944 | WARSZAWA | 2022 | 12 | 30 | 8.7 | 4.0 | 5.9 | 0.0 | 0 |
| 1945 | WARSZAWA | 2022 | 12 | 31 | 13.6 | 0.6 | 8.3 | 0.0 | 0 |
shape(temperature_df)
Number of rows and columns: (1946, 9)
info(temperature_df)
Info about the data frame: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1946 entries, 0 to 1945 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Nazwa stacji 1946 non-null object 1 Rok 1946 non-null int64 2 Miesiac 1946 non-null int64 3 Dzien 1946 non-null int64 4 Maksymalna temperatura dobowa 1946 non-null float64 5 Minimalna temperatura dobowa 1946 non-null float64 6 Srednia temperatura dobowa 1946 non-null float64 7 Suma dobowa opadow [mm] 1946 non-null float64 8 Pokrywa sniegu [cm] 1946 non-null int64 dtypes: float64(4), int64(4), object(1) memory usage: 137.0+ KB
None
# Check the station-name column for typos or inconsistent labels.
unique_values = temperature_df['Nazwa stacji'].unique()
# Plain string literal: the original f-prefix had no placeholders.
print("Unique values in column Nazwa stacji:")
print(unique_values)
Unique values in column Nazwa stacji: ['KRAKOW' 'WARSZAWA']
No invalid object-type data.
statistics(temperature_df)
Descriptive statistic of the data frame:
| Rok | Miesiac | Dzien | Maksymalna temperatura dobowa | Minimalna temperatura dobowa | Srednia temperatura dobowa | Suma dobowa opadow [mm] | Pokrywa sniegu [cm] | |
|---|---|---|---|---|---|---|---|---|
| count | 1946.000000 | 1946.000000 | 1946.000000 | 1946.000000 | 1946.000000 | 1946.000000 | 1946.000000 | 1946.000000 |
| mean | 2021.874615 | 6.026721 | 15.716341 | 14.826310 | 6.286948 | 10.062384 | 1.078263 | 0.345838 |
| std | 0.780681 | 3.315044 | 8.797155 | 9.660984 | 7.420518 | 8.285873 | 4.161008 | 2.033935 |
| min | 2021.000000 | 1.000000 | 1.000000 | -12.800000 | -17.500000 | -15.700000 | 0.000000 | 0.000000 |
| 25% | 2021.000000 | 3.000000 | 8.000000 | 6.700000 | 0.400000 | 3.100000 | 0.000000 | 0.000000 |
| 50% | 2022.000000 | 6.000000 | 16.000000 | 15.400000 | 6.100000 | 10.000000 | 0.000000 | 0.000000 |
| 75% | 2022.000000 | 9.000000 | 23.000000 | 22.800000 | 12.500000 | 17.200000 | 0.000000 | 0.000000 |
| max | 2023.000000 | 12.000000 | 31.000000 | 36.700000 | 21.900000 | 28.200000 | 103.400000 | 27.000000 |
missing_values_statistic(temperature_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| Nazwa stacji | 0 | 0.0 |
| Rok | 0 | 0.0 |
| Miesiac | 0 | 0.0 |
| Dzien | 0 | 0.0 |
| Maksymalna temperatura dobowa | 0 | 0.0 |
| Minimalna temperatura dobowa | 0 | 0.0 |
| Srednia temperatura dobowa | 0 | 0.0 |
| Suma dobowa opadow [mm] | 0 | 0.0 |
| Pokrywa sniegu [cm] | 0 | 0.0 |
No missing values.
value_counts = dim_dates_df['isWeekendParcelServiceBreakInd'].value_counts()
count_of_ones = value_counts.get(1, 0)  # default 0 in case no 1s exist
rows = dim_dates_df.shape[0]
# Multiply by 100: the original printed the raw fraction (22/1461 ≈ 0.02)
# with a '%' suffix, understating the true share (~1.51%).
percentage_of_ones = round(count_of_ones / rows * 100, 2)
print(f"Count of ones: {count_of_ones}")
print(f"Percentage of ones: {percentage_of_ones}%")
Count of ones: 22 Percantage of ones: 0.02%
The isWeekendParcelServiceBreakInd column contains the weekend courier service break indicator (0/1). As the statistics show, the number of 1 values in this column is very small. This means that the value 0 is the dominant category. In this situation, when about 25% of the data in this column is missing, using the dominant value (0) to fill in the missing data seems to be the right strategy.
Since the number of days with courier service breaks (1) is very small, their impact on the overall results of the prediction model will also be small, given the large amount of other data in the DimDates table. Filling in the missing values as 0 should not affect the overall predictions.
# Fill the missing break indicators with the dominant category (the mode).
most_frequent_value = dim_dates_df['isWeekendParcelServiceBreakInd'].mode()[0]
# Assign back instead of fillna(..., inplace=True) on a column selection:
# chained in-place modification may act on a copy and is deprecated in pandas 2.x.
dim_dates_df['isWeekendParcelServiceBreakInd'] = (
    dim_dates_df['isWeekendParcelServiceBreakInd'].fillna(most_frequent_value)
)
missing_values_after = dim_dates_df['isWeekendParcelServiceBreakInd'].isnull().sum()
print("\nMissing values after filling:")
print(missing_values_after)
Missing values after filling: 0
# Parse the yyyymmdd integer key into a real datetime so the table can be
# joined and sorted chronologically with the other data sets.
dim_dates_df['datePk'] = pd.to_datetime(dim_dates_df['datePk'], format='%Y%m%d')
print("\nData type of datePk after conversion:")
print(dim_dates_df['datePk'].dtype)
Data type of datePk after conversion: datetime64[ns]
dim_dates_df=dim_dates_df.rename(columns={"datePk": "date"})
# Show the offending rows, then report their share of the whole table.
print(posting_volume_df[posting_volume_df['Volume'] < 0])
negative_volume_count = posting_volume_df[posting_volume_df['Volume'] < 0].shape[0]
rows = posting_volume_df.shape[0]
# Multiply by 100: the original printed the raw fraction (4/3892 ≈ 0.001)
# with a '%' suffix, understating the true share (~0.103%).
percentage_of_negative_volume = round(negative_volume_count / rows * 100, 3)
print(f"Percentage of negative values: {percentage_of_negative_volume}%")
postingDateFk Product Customer Volume 17 20210118 APM X -501.072484 990 20210118 APM Rest -565.057691 1963 20210118 COURIER X -421.967589 2936 20210118 COURIER Rest -876.759667 Percantage of negative values: 0.001%
Negative values in the Volume column are unexpected and indicate errors in the data, because the daily shipment volume should be a non-negative value. All of these values share the same posting date, which may indicate a data-entry error. Because there are so few of them, they can safely be replaced with 0.
# Clamp negative volumes to 0. Series.clip(lower=0) is the vectorized
# equivalent of the previous apply(lambda x: max(x, 0)) — same result, one
# C-level pass instead of a Python call per row.
posting_volume_df['Volume'] = posting_volume_df['Volume'].clip(lower=0)
negative_values_count = posting_volume_df[posting_volume_df['Volume'] < 0].shape[0]
print(f"Negative values count: {negative_values_count}")
Negative values count: 0
# Parse the yyyymmdd posting-date key into a datetime column.
posting_dates = pd.to_datetime(posting_volume_df['postingDateFk'], format='%Y%m%d')
posting_volume_df['postingDateFk'] = posting_dates
print("\nData type of postingDateFk after conversion:")
print(posting_volume_df['postingDateFk'].dtypes)
Data type of postingDateFk after conversion: datetime64[ns]
# Align the key name with the other tables, then collapse to one total per day.
posting_volume_df = posting_volume_df.rename(columns={'postingDateFk': 'date'})
daily_volume_df = posting_volume_df.groupby('date', as_index=False)['Volume'].sum()
daily_volume_df.head()
shape(daily_volume_df)
Number of rows and columns: (973, 2)
There is one missing value in the Orders column. This can be resolved by replacing it with the median of the data in the column.
# Fill the single missing Orders value with the median, as stated in the note
# above (the previous code used the mean, contradicting the documented
# strategy; the median is also robust to outliers). Assign back to the column
# rather than using the deprecated fillna(inplace=True) on a slice.
clients_orders_df['Orders'] = clients_orders_df['Orders'].fillna(clients_orders_df['Orders'].median())
missing_values_statistic(clients_orders_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| DateId | 0 | 0.0 |
| Orders | 0 | 0.0 |
# Parse the yyyymmdd order-date key into a datetime column.
order_dates = pd.to_datetime(clients_orders_df['DateId'], format='%Y%m%d')
clients_orders_df['DateId'] = order_dates
print("\nData type of DateId after conversion:")
print(clients_orders_df['DateId'].dtypes)
Data type of DateId after conversion: datetime64[ns]
# Harmonise key names, then assemble a single datetime column from the Polish
# year/month/day columns and drop the now-redundant parts.
clients_orders_df = clients_orders_df.rename(columns={'DateId': 'date'})
temperature_df = temperature_df.rename(columns={'Rok': 'year', 'Miesiac': 'month', 'Dzien': 'day'})
temperature_df['date'] = pd.to_datetime(temperature_df[['year', 'month', 'day']])
temperature_df = temperature_df.drop(columns=['year', 'month', 'day'])
temperature_df.head(3)
| Nazwa stacji | Maksymalna temperatura dobowa | Minimalna temperatura dobowa | Srednia temperatura dobowa | Suma dobowa opadow [mm] | Pokrywa sniegu [cm] | date | |
|---|---|---|---|---|---|---|---|
| 0 | KRAKOW | 0.5 | -1.2 | -0.5 | 0.0 | 0 | 2021-01-01 |
| 1 | KRAKOW | 4.4 | -1.6 | 0.8 | 0.0 | 0 | 2021-01-02 |
| 2 | KRAKOW | 4.0 | -0.9 | 1.1 | 0.0 | 0 | 2021-01-03 |
# Average every weather metric across stations per day, then give the columns
# English names.
weather_metrics = [
    'Maksymalna temperatura dobowa',
    'Minimalna temperatura dobowa',
    'Srednia temperatura dobowa',
    'Suma dobowa opadow [mm]',
    'Pokrywa sniegu [cm]',
]
temperature_avg_df = temperature_df.groupby('date')[weather_metrics].mean().reset_index()
temperature_avg_df.columns = ['date', 'AvgMaxTemp', 'AvgMinTemp', 'AvgMeanTemp', 'AvgPrecipitation', 'AvgSnowCover']
shape(temperature_avg_df)
Number of rows and columns: (973, 6)
head_and_tail(temperature_avg_df)
First and last rows of the data frame:
| date | AvgMaxTemp | AvgMinTemp | AvgMeanTemp | AvgPrecipitation | AvgSnowCover | |
|---|---|---|---|---|---|---|
| 0 | 2021-01-01 | 0.70 | -1.20 | -0.35 | 0.00 | 0.0 |
| 1 | 2021-01-02 | 2.80 | -1.25 | 0.55 | 0.00 | 0.0 |
| 2 | 2021-01-03 | 3.20 | -0.05 | 1.15 | 0.00 | 0.0 |
| 3 | 2021-01-04 | 4.15 | -0.10 | 1.95 | 3.55 | 0.0 |
| date | AvgMaxTemp | AvgMinTemp | AvgMeanTemp | AvgPrecipitation | AvgSnowCover | |
|---|---|---|---|---|---|---|
| 969 | 2023-08-28 | 23.25 | 17.55 | 19.45 | 0.05 | 0.0 |
| 970 | 2023-08-29 | 25.55 | 17.85 | 21.05 | 0.45 | 0.0 |
| 971 | 2023-08-30 | 21.80 | 16.70 | 18.30 | 0.15 | 0.0 |
| 972 | 2023-08-31 | 22.45 | 14.80 | 17.05 | 0.05 | 0.0 |
# Left-join calendar, weather and client-forecast features onto the daily
# volume series (one row per day; 'date' is the common key).
combined_df = (
    daily_volume_df
    .merge(dim_dates_df, on='date', how='left')
    .merge(temperature_avg_df, on='date', how='left')
    .merge(clients_orders_df, on='date', how='left')
)
head_and_tail(combined_df)
First and last rows of the data frame:
| date | Volume | dateId | dateYear | dateYearISO | dateMonthYear | dateWeekYearISO | dateQuarterYear | dateMonth | dateDay | dateDaySuffix | dateWeekDay | dateWeekDayName | dateWeekDayNameShort | dateWeekDayNameFirstLetter | dateDayOfYear | dateWeekOfMonth | dateWeekOfYear | dateWeekOfYearISO | dateMonthName | dateMonthNameShort | dateMonthNameFirstLetter | dateQuarter | dateQuarterName | dateQuarterShort | dateIsWeekend | dateIsHolidayInd | dateWeekDayStartsMonday | dateWeekDayNamePL | dateMonthNamePL | dateWeekOfYearStartsMonday | firstDayWorkBefore | firstDayWorkAfter | firstDayNoHolidayNoSaturdayAfter | firstDayNoHolidayNoSundayAfter | firstDayNoHolidayNoSaturdayAfterData | firstDayNoSaturdayNoHolidayOrSundayHolidayName | firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | firstDayNoSaturdayNoHolidayOrSundayHoliday | isThuHolyday | isFriHolyday | isSatHolyday | isWeekendParcelServiceBreakInd | courierBillingPeriodDateFk | courierBillingPeriodStartDateFk | courierBillingPeriodEndDateFk | xlsDate | dwhDateFrom | dwhDateTo | dwhCurrRec | dwhAuditCd | dwhAuditCp | dwhAuditMd | dwhAuditMp | AvgMaxTemp | AvgMinTemp | AvgMeanTemp | AvgPrecipitation | AvgSnowCover | Orders | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-01 | 11367.648290 | 2021-01-01 | 2021 | 2020 | 1.2021 | T53-2020 | Q1-2021 | 1 | 1 | st | 6 | Friday | FRI | F | 1 | 1 | 1 | 53 | January | JAN | J | 1 | First | Qtr 1 | 0 | 1 | 5 | Piątek | Styczeń | 1 | 20201230 | 20210104 | 20210103 | 20210102 | 20210103 | SUN | 0 | 20210103 | 1 | 1 | 0 | 0.0 | 20210101 | 20210101 | 20210115 | 44197 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 0.70 | -1.20 | -0.35 | 0.00 | 0.0 | NaN |
| 1 | 2021-01-02 | 106467.612644 | 2021-01-02 | 2021 | 2020 | 1.2021 | T53-2020 | Q1-2021 | 1 | 2 | nd | 7 | Saturday | SAT | S | 2 | 1 | 1 | 53 | January | JAN | J | 1 | First | Qtr 1 | 1 | 0 | 6 | Sobota | Styczeń | 1 | 20201230 | 20210104 | 20210103 | 20210104 | 20210103 | SUN | 0 | 20210103 | 1 | 1 | 0 | 1.0 | 20210101 | 20210101 | 20210115 | 44198 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 2.80 | -1.25 | 0.55 | 0.00 | 0.0 | NaN |
| 2 | 2021-01-03 | 298554.245374 | 2021-01-03 | 2021 | 2020 | 1.2021 | T53-2020 | Q1-2021 | 1 | 3 | rd | 1 | Sunday | SUN | S | 3 | 2 | 2 | 53 | January | JAN | J | 1 | First | Qtr 1 | 1 | 0 | 7 | Niedziela | Styczeń | 1 | 20201230 | 20210104 | 20210104 | 20210104 | 20210104 | MON | 0 | 20210104 | 0 | 0 | 0 | 0.0 | 20210101 | 20210101 | 20210115 | 44199 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 3.20 | -0.05 | 1.15 | 0.00 | 0.0 | NaN |
| 3 | 2021-01-04 | 167551.554724 | 2021-01-04 | 2021 | 2021 | 1.2021 | T1-2021 | Q1-2021 | 1 | 4 | th | 2 | Monday | MON | M | 4 | 2 | 2 | 1 | January | JAN | J | 1 | First | Qtr 1 | 0 | 0 | 1 | Poniedziałek | Styczeń | 2 | 20201230 | 20210105 | 20210105 | 20210105 | 20210105 | TUE | 0 | 20210105 | 0 | 0 | 0 | 0.0 | 20210101 | 20210101 | 20210115 | 44200 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 4.15 | -0.10 | 1.95 | 3.55 | 0.0 | NaN |
| date | Volume | dateId | dateYear | dateYearISO | dateMonthYear | dateWeekYearISO | dateQuarterYear | dateMonth | dateDay | dateDaySuffix | dateWeekDay | dateWeekDayName | dateWeekDayNameShort | dateWeekDayNameFirstLetter | dateDayOfYear | dateWeekOfMonth | dateWeekOfYear | dateWeekOfYearISO | dateMonthName | dateMonthNameShort | dateMonthNameFirstLetter | dateQuarter | dateQuarterName | dateQuarterShort | dateIsWeekend | dateIsHolidayInd | dateWeekDayStartsMonday | dateWeekDayNamePL | dateMonthNamePL | dateWeekOfYearStartsMonday | firstDayWorkBefore | firstDayWorkAfter | firstDayNoHolidayNoSaturdayAfter | firstDayNoHolidayNoSundayAfter | firstDayNoHolidayNoSaturdayAfterData | firstDayNoSaturdayNoHolidayOrSundayHolidayName | firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | firstDayNoSaturdayNoHolidayOrSundayHoliday | isThuHolyday | isFriHolyday | isSatHolyday | isWeekendParcelServiceBreakInd | courierBillingPeriodDateFk | courierBillingPeriodStartDateFk | courierBillingPeriodEndDateFk | xlsDate | dwhDateFrom | dwhDateTo | dwhCurrRec | dwhAuditCd | dwhAuditCp | dwhAuditMd | dwhAuditMp | AvgMaxTemp | AvgMinTemp | AvgMeanTemp | AvgPrecipitation | AvgSnowCover | Orders | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 969 | 2023-08-28 | 624376.598443 | 2023-08-28 | 2023 | 2023 | 8.2023 | T35-2023 | Q3-2023 | 8 | 28 | th | 2 | Monday | MON | M | 240 | 5 | 35 | 35 | August | AUG | A | 3 | Third | Qtr 3 | 0 | 0 | 1 | Poniedziałek | Sierpień | 36 | 20230825 | 20230829 | 20230829 | 20230829 | 20230829 | TUE | 0 | 20230829 | 0 | 0 | 0 | 0.0 | 20230816 | 20230816 | 20230831 | 45166 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 23.25 | 17.55 | 19.45 | 0.05 | 0.0 | 334471.057583 |
| 970 | 2023-08-29 | 916679.147861 | 2023-08-29 | 2023 | 2023 | 8.2023 | T35-2023 | Q3-2023 | 8 | 29 | th | 3 | Tuesday | TUE | T | 241 | 5 | 35 | 35 | August | AUG | A | 3 | Third | Qtr 3 | 0 | 0 | 2 | Wtorek | Sierpień | 36 | 20230828 | 20230830 | 20230830 | 20230830 | 20230830 | WED | 0 | 20230830 | 0 | 0 | 0 | 0.0 | 20230816 | 20230816 | 20230831 | 45167 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 25.55 | 17.85 | 21.05 | 0.45 | 0.0 | 407334.926198 |
| 971 | 2023-08-30 | 927152.139888 | 2023-08-30 | 2023 | 2023 | 8.2023 | T35-2023 | Q3-2023 | 8 | 30 | th | 4 | Wednesday | WED | W | 242 | 5 | 35 | 35 | August | AUG | A | 3 | Third | Qtr 3 | 0 | 0 | 3 | Środa | Sierpień | 36 | 20230829 | 20230831 | 20230831 | 20230831 | 20230831 | THU | 0 | 20230831 | 0 | 0 | 0 | 0.0 | 20230816 | 20230816 | 20230831 | 45168 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 21.80 | 16.70 | 18.30 | 0.15 | 0.0 | 354357.441720 |
| 972 | 2023-08-31 | 730302.951767 | 2023-08-31 | 2023 | 2023 | 8.2023 | T35-2023 | Q3-2023 | 8 | 31 | st | 5 | Thursday | THU | T | 243 | 5 | 35 | 35 | August | AUG | A | 3 | Third | Qtr 3 | 0 | 0 | 4 | Czwartek | Sierpień | 36 | 20230830 | 20230901 | 20230901 | 20230901 | 20230901 | FRI | 0 | 20230901 | 0 | 0 | 0 | 0.0 | 20230816 | 20230816 | 20230831 | 45169 | 1900-01-01 | 9999-12-31 | 1 | 2020-07-03 13:51:15.673 | 1904719 | 2020-07-03 13:51:15.673 | 1904719 | 22.45 | 14.80 | 17.05 | 0.05 | 0.0 | 275094.376827 |
shape(combined_df)
Number of rows and columns: (973, 60)
missing_values_statistic(combined_df)
| Missing values count | Percentage of missing data | |
|---|---|---|
| date | 0 | 0.000000 |
| Volume | 0 | 0.000000 |
| dateId | 0 | 0.000000 |
| dateYear | 0 | 0.000000 |
| dateYearISO | 0 | 0.000000 |
| dateMonthYear | 0 | 0.000000 |
| dateWeekYearISO | 0 | 0.000000 |
| dateQuarterYear | 0 | 0.000000 |
| dateMonth | 0 | 0.000000 |
| dateDay | 0 | 0.000000 |
| dateDaySuffix | 0 | 0.000000 |
| dateWeekDay | 0 | 0.000000 |
| dateWeekDayName | 0 | 0.000000 |
| dateWeekDayNameShort | 0 | 0.000000 |
| dateWeekDayNameFirstLetter | 0 | 0.000000 |
| dateDayOfYear | 0 | 0.000000 |
| dateWeekOfMonth | 0 | 0.000000 |
| dateWeekOfYear | 0 | 0.000000 |
| dateWeekOfYearISO | 0 | 0.000000 |
| dateMonthName | 0 | 0.000000 |
| dateMonthNameShort | 0 | 0.000000 |
| dateMonthNameFirstLetter | 0 | 0.000000 |
| dateQuarter | 0 | 0.000000 |
| dateQuarterName | 0 | 0.000000 |
| dateQuarterShort | 0 | 0.000000 |
| dateIsWeekend | 0 | 0.000000 |
| dateIsHolidayInd | 0 | 0.000000 |
| dateWeekDayStartsMonday | 0 | 0.000000 |
| dateWeekDayNamePL | 0 | 0.000000 |
| dateMonthNamePL | 0 | 0.000000 |
| dateWeekOfYearStartsMonday | 0 | 0.000000 |
| firstDayWorkBefore | 0 | 0.000000 |
| firstDayWorkAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSaturdayAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSundayAfter | 0 | 0.000000 |
| firstDayNoHolidayNoSaturdayAfterData | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHolidayName | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHolidayIsHoliday | 0 | 0.000000 |
| firstDayNoSaturdayNoHolidayOrSundayHoliday | 0 | 0.000000 |
| isThuHolyday | 0 | 0.000000 |
| isFriHolyday | 0 | 0.000000 |
| isSatHolyday | 0 | 0.000000 |
| isWeekendParcelServiceBreakInd | 0 | 0.000000 |
| courierBillingPeriodDateFk | 0 | 0.000000 |
| courierBillingPeriodStartDateFk | 0 | 0.000000 |
| courierBillingPeriodEndDateFk | 0 | 0.000000 |
| xlsDate | 0 | 0.000000 |
| dwhDateFrom | 0 | 0.000000 |
| dwhDateTo | 0 | 0.000000 |
| dwhCurrRec | 0 | 0.000000 |
| dwhAuditCd | 0 | 0.000000 |
| dwhAuditCp | 0 | 0.000000 |
| dwhAuditMd | 0 | 0.000000 |
| dwhAuditMp | 0 | 0.000000 |
| AvgMaxTemp | 0 | 0.000000 |
| AvgMinTemp | 0 | 0.000000 |
| AvgMeanTemp | 0 | 0.000000 |
| AvgPrecipitation | 0 | 0.000000 |
| AvgSnowCover | 0 | 0.000000 |
| Orders | 730 | 75.025694 |
In the combined table, only the Orders column has missing data, which is because this data was only provided from the beginning of 2023. This column contains important information that is forecasted by customer X, but its usefulness in predictive modeling is limited by the large number of missing values. The lack of historical data limits the model's ability to predict based on long-term trends.
# Split the posting volumes by customer and rebuild the daily feature table for
# each segment with the same joins as the combined table.
client_x_summed_df = (
    posting_volume_df[posting_volume_df['Customer'] == 'X']
    .groupby('date')['Volume'].sum().reset_index()
)
client_rest_summed_df = (
    posting_volume_df[posting_volume_df['Customer'] == 'Rest']
    .groupby('date')['Volume'].sum().reset_index()
)
combined_df_x = (
    client_x_summed_df
    .merge(dim_dates_df, on='date', how='left')
    .merge(temperature_avg_df, on='date', how='left')
    .merge(clients_orders_df, on='date', how='left')
)
combined_df_rest = (
    client_rest_summed_df
    .merge(dim_dates_df, on='date', how='left')
    .merge(temperature_avg_df, on='date', how='left')
    .merge(clients_orders_df, on='date', how='left')
)
Helper function:
def plot_grouped_bar_chart(combined_df, combined_df_x, combined_df_rest, group_by_col, x_labels, x_label, y_label, title):
    """Grouped bar chart of mean Volume per category for All / X / Rest.

    Each input frame is averaged over `group_by_col`, tagged with its customer
    segment, and the three results are drawn side by side with seaborn.
    """
    segments = ((combined_df, 'All'), (combined_df_x, 'X'), (combined_df_rest, 'Rest'))
    frames = []
    for frame, segment in segments:
        means = frame.groupby(group_by_col)['Volume'].mean().reset_index()
        means['Customer'] = segment
        frames.append(means)
    group_combined = pd.concat(frames)
    plt.figure(figsize=(12, 6))
    sns.barplot(data=group_combined, x=group_by_col, y='Volume', hue='Customer', palette=['gold', 'black', 'darkgray'])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # Replace the raw category codes on the x axis with the supplied labels.
    plt.xticks(ticks=np.arange(len(x_labels)), labels=x_labels)
    plt.show()
# Seasonality: average daily volume per calendar month for each segment.
plot_grouped_bar_chart(
    combined_df,
    combined_df_x,
    combined_df_rest,
    group_by_col='dateMonth',
    x_labels=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
    x_label='Month',
    y_label='Average Volume',
    title='Average Volume by Month'
)
# Weekly pattern: average daily volume per weekday for each segment.
plot_grouped_bar_chart(
    combined_df,
    combined_df_x,
    combined_df_rest,
    group_by_col='dateWeekDay',
    x_labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
    x_label='Day of the Week',
    y_label='Average Volume',
    title='Average Volume by Day of the Week'
)
The highest number of shipments is recorded on Thursdays and Fridays, which may suggest an increase in shipment activity towards the end of the work week.
The lowest number of shipments is recorded on Sundays and Tuesdays.
# Weekend effect: compare average volume on weekdays vs weekends.
plot_grouped_bar_chart(
    combined_df,
    combined_df_x,
    combined_df_rest,
    group_by_col='dateIsWeekend',
    x_labels=['Weekday', 'Weekend'],
    x_label='Is Weekend',
    y_label='Average Volume',
    title='Average Volume by Weekend'
)
# Holiday effect: compare average volume on holidays vs regular days.
plot_grouped_bar_chart(
    combined_df,
    combined_df_x,
    combined_df_rest,
    group_by_col='dateIsHolidayInd',
    x_labels=['Non-Holiday', 'Holiday'],
    x_label='Is Holiday',
    y_label='Average Volume',
    title='Average Volume by Holiday'
)
# Evaluate client X's own forecast (Orders) against the realised volumes.
# .copy() makes an explicit new frame, which removes the SettingWithCopyWarning
# previously raised when adding the Forecast_Error column to a slice.
combined_df_x_dropna = combined_df_x.dropna(subset=['Orders']).copy()
combined_df_x_dropna['Forecast_Error'] = combined_df_x_dropna['Volume'] - combined_df_x_dropna['Orders']
mae = mean_absolute_error(combined_df_x_dropna['Volume'], combined_df_x_dropna['Orders'])
# mean_squared_error(..., squared=False) is deprecated (removed in
# scikit-learn 1.6); take the square root of the MSE explicitly instead.
rmse = mean_squared_error(combined_df_x_dropna['Volume'], combined_df_x_dropna['Orders']) ** 0.5
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
plt.figure(figsize=(14, 8))
plt.plot(combined_df_x_dropna['date'], combined_df_x_dropna['Volume'], label='Actual Volume', color='black')
plt.plot(combined_df_x_dropna['date'], combined_df_x_dropna['Orders'], label='Forecasted Orders', linestyle='--', color='gold')
plt.xlabel('Date')
plt.ylabel('Volume')
plt.title('Actual vs Forecasted Volume for Client X')
plt.legend()
plt.show()
# One actual-vs-forecast chart per month for which Orders data exists (Jan-Aug).
for month in range(1, 9):
    month_mask = combined_df_x_dropna['dateMonth'] == month
    monthly_data = combined_df_x_dropna[month_mask]
    plt.figure(figsize=(14, 8))
    plt.plot(monthly_data['date'], monthly_data['Volume'], label='Actual Volume', color='black')
    plt.plot(monthly_data['date'], monthly_data['Orders'], label='Forecasted Orders', linestyle='--', color='gold')
    plt.xlabel('Date')
    plt.ylabel('Volume')
    plt.title(f'Actual vs Forecasted Volume for Client X in Month {month}')
    plt.legend()
    plt.show()
C:\Users\sonia\AppData\Local\Temp\ipykernel_18272\14278924.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy combined_df_x_dropna.loc[:, 'Forecast_Error'] = combined_df_x_dropna['Volume'] - combined_df_x_dropna['Orders']
Mean Absolute Error (MAE): 100367.75433039053 Root Mean Squared Error (RMSE): 120469.5230958793
Helper function:
def plot_weather_impact(data, data_x, data_rest, x_col, y_col, x_label, y_label, title):
    """Scatter plot of mean `y_col` per `x_col` value for All / X / Rest.

    Each frame is averaged over `x_col`, tagged with its customer segment, and
    the three series are drawn on one seaborn scatter plot.
    """
    segments = ((data, 'All'), (data_x, 'X'), (data_rest, 'Rest'))
    frames = []
    for frame, segment in segments:
        means = frame.groupby(x_col)[y_col].mean().reset_index()
        means['Customer'] = segment
        frames.append(means)
    combined_analysis = pd.concat(frames)
    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=combined_analysis, x=x_col, y=y_col, hue='Customer', palette=['gold', 'black', 'darkgray'])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
# Visualise how each weather metric relates to daily shipment volume.
plot_weather_impact(combined_df, combined_df_x, combined_df_rest, 'AvgMeanTemp', 'Volume', 'Average Mean Temperature (°C)', 'Volume', 'Impact of Average Mean Temperature on Volume')
plot_weather_impact(combined_df, combined_df_x, combined_df_rest, 'AvgPrecipitation', 'Volume', 'Average Precipitation (mm)', 'Volume', 'Impact of Average Precipitation on Volume')
plot_weather_impact(combined_df, combined_df_x, combined_df_rest, 'AvgSnowCover', 'Volume', 'Average Snow Cover (cm)', 'Volume', 'Impact of Average Snow Cover on Volume')
# Correlate volume with the weather metrics for each customer segment and show
# the three matrices as heatmaps.
weather_cols = ['Volume', 'AvgMaxTemp', 'AvgMinTemp', 'AvgMeanTemp', 'AvgPrecipitation', 'AvgSnowCover']
correlation_matrix_all = combined_df[weather_cols].corr()
correlation_matrix_x = combined_df_x[weather_cols].corr()
correlation_matrix_rest = combined_df_rest[weather_cols].corr()
heatmap_specs = (
    (correlation_matrix_all, 'Correlation Matrix (All)'),
    (correlation_matrix_x, 'Correlation Matrix (Client X)'),
    (correlation_matrix_rest, 'Correlation Matrix (Rest)'),
)
for matrix, heatmap_title in heatmap_specs:
    plt.figure(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, cmap='crest')
    plt.title(heatmap_title)
plt.show()
First, I discard data that is redundant with other variables or not useful for predictive modeling of shipment volume:
# Candidate predictors: calendar features plus station-averaged weather.
# Identifier-like and redundant columns from DimDates are deliberately omitted.
selected_columns = [
    'dateYear', 'dateMonth', 'dateDay', 'dateWeekDay', 'dateIsWeekend',
    'dateIsHolidayInd', 'dateDayOfYear', 'dateWeekOfMonth', 'dateWeekOfYear', 'dateQuarter',
    'isThuHolyday', 'isFriHolyday', 'isSatHolyday', 'AvgMaxTemp', 'AvgMinTemp', 'AvgMeanTemp',
    'AvgPrecipitation', 'AvgSnowCover'
]
X = combined_df[selected_columns]
# Target: total daily shipment volume.
y = combined_df['Volume']
# Pairwise correlations between the candidate predictors.
corr_matrix = X.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='crest')
plt.show()
def get_highly_correlated_features(corr_matrix, threshold=0.8):
    """Return unique feature pairs whose absolute correlation exceeds `threshold`.

    The diagonal is excluded by comparing feature names rather than by dropping
    values equal to 1 — the previous value-based check also discarded genuinely
    perfectly-correlated pairs of *distinct* features.

    Parameters
    ----------
    corr_matrix : pd.DataFrame
        Square correlation matrix (e.g. the result of DataFrame.corr()).
    threshold : float, default 0.8
        Absolute-correlation cutoff (exclusive).

    Returns
    -------
    pd.DataFrame
        Columns Feature1, Feature2, Correlation — one row per unordered pair,
        sorted by correlation descending.
    """
    corr_pairs = corr_matrix.unstack()
    high_corr_pairs = corr_pairs[abs(corr_pairs) > threshold]
    high_corr_df = pd.DataFrame(high_corr_pairs).reset_index()
    high_corr_df.columns = ['Feature1', 'Feature2', 'Correlation']
    # Drop the diagonal by name, not by value == 1.
    high_corr_df = high_corr_df[high_corr_df['Feature1'] != high_corr_df['Feature2']]
    high_corr_df = high_corr_df.sort_values(by='Correlation', ascending=False)
    # Keep each unordered pair only once (the matrix is symmetric).
    high_corr_df['sorted_pair'] = high_corr_df.apply(lambda row: tuple(sorted([row['Feature1'], row['Feature2']])), axis=1)
    high_corr_df = high_corr_df.drop_duplicates(subset='sorted_pair')
    high_corr_df = high_corr_df.drop(columns='sorted_pair')
    return high_corr_df
# Report feature pairs above the chosen correlation threshold. The message is
# built from the actual threshold — the previous version hard-coded "0.8" while
# passing 0.5 to the function.
high_corr_threshold = 0.5
high_corr_features = get_highly_correlated_features(corr_matrix, threshold=high_corr_threshold)
print(f"Most correlated features (correlation threshold > {high_corr_threshold}):")
print(high_corr_features)
Most correlated features (correlation threshold > 0.8):
Feature1 Feature2 Correlation
9 dateWeekOfYear dateDayOfYear 0.999546
0 dateMonth dateDayOfYear 0.996216
1 dateMonth dateWeekOfYear 0.995773
20 AvgMeanTemp AvgMaxTemp 0.982468
19 AvgMinTemp AvgMeanTemp 0.973642
2 dateMonth dateQuarter 0.969524
12 dateQuarter dateDayOfYear 0.966191
10 dateWeekOfYear dateQuarter 0.965722
7 dateWeekOfMonth dateDay 0.955137
16 AvgMaxTemp AvgMinTemp 0.921066
14 isFriHolyday isSatHolyday 0.537515
Based on the data correlation analysis, in the further modeling process, you can skip:
# Drop one feature from each highly correlated pair found above, keeping the
# more interpretable / less redundant member.
features_to_remove = [
    'dateWeekOfYear', 'dateDayOfYear', 'dateQuarter',
    'AvgMeanTemp', 'AvgMinTemp', 'isSatHolyday', 'dateWeekOfMonth',
]
X_sel = X.drop(columns=features_to_remove)
info(X_sel)
Info about the data frame: <class 'pandas.core.frame.DataFrame'> RangeIndex: 973 entries, 0 to 972 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dateYear 973 non-null int64 1 dateMonth 973 non-null int64 2 dateDay 973 non-null int64 3 dateWeekDay 973 non-null int64 4 dateIsWeekend 973 non-null int64 5 dateIsHolidayInd 973 non-null int64 6 isThuHolyday 973 non-null int64 7 isFriHolyday 973 non-null int64 8 AvgMaxTemp 973 non-null float64 9 AvgPrecipitation 973 non-null float64 10 AvgSnowCover 973 non-null float64 dtypes: float64(3), int64(8) memory usage: 83.7 KB
None
# Re-check pairwise correlations after pruning the redundant features.
corr_matrix = X_sel.corr()
plt.figure(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='crest')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.25, random_state=7)
Scaling numeric data:
# Standardise the continuous weather features. The scaler is fitted on the
# training split only, then applied to the test split, to avoid data leakage.
numeric_features = ['AvgMaxTemp', 'AvgPrecipitation', 'AvgSnowCover']
scaler = StandardScaler()
X_train.loc[:, numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test.loc[:, numeric_features] = scaler.transform(X_test[numeric_features])
print("Scaled X_train:")
display(X_train.head())
Scaled X_train:
| dateYear | dateMonth | dateDay | dateWeekDay | dateIsWeekend | dateIsHolidayInd | isThuHolyday | isFriHolyday | AvgMaxTemp | AvgPrecipitation | AvgSnowCover | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 442 | 2022 | 3 | 19 | 7 | 1 | 0 | 0 | 0 | -0.599649 | -0.436524 | -0.242441 |
| 871 | 2023 | 5 | 22 | 2 | 0 | 0 | 0 | 0 | 0.802564 | -0.436524 | -0.242441 |
| 954 | 2023 | 8 | 13 | 1 | 1 | 0 | 0 | 0 | 1.627086 | -0.436524 | -0.242441 |
| 113 | 2021 | 4 | 24 | 7 | 1 | 0 | 0 | 0 | -0.331810 | -0.436524 | -0.242441 |
| 499 | 2022 | 5 | 15 | 1 | 1 | 0 | 0 | 0 | 0.602998 | -0.436524 | -0.242441 |
Regressors:
# One scaler + estimator pipeline per candidate regressor, plus the
# hyperparameter grid searched for each of them.
pipeline_dt = Pipeline(steps=[("scalar1", StandardScaler()), ("dt", DecisionTreeRegressor())])
pipeline_lr = Pipeline(steps=[("scalar2", StandardScaler()), ("lr", LinearRegression())])
pipeline_rf = Pipeline(steps=[("scalar3", StandardScaler()), ("rf", RandomForestRegressor())])
pipeline_xgb = Pipeline(steps=[("scalar4", StandardScaler()), ("xgb", XGBRegressor())])
pipeline_kn = Pipeline(steps=[("scalar5", StandardScaler()), ("kn", KNeighborsRegressor())])
param_grid_dt = {
    'dt__max_depth': [3, 5, 10, None],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
}
param_grid_lr = {'lr__fit_intercept': [True, False]}
param_grid_rf = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}
param_grid_xgb = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 10],
    'xgb__subsample': [0.6, 0.8, 1.0],
}
param_grid_kn = {
    'kn__n_neighbors': [3, 5, 10],
    'kn__weights': ['uniform', 'distance'],
    'kn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
# Run a 10-fold grid search per model, scored by negated RMSE (sklearn
# maximises, so the score is negated back when printed).
search_kwargs = dict(scoring='neg_root_mean_squared_error', cv=10)
grid_search_lr = GridSearchCV(estimator=pipeline_lr, param_grid=param_grid_lr, **search_kwargs)
grid_search_dt = GridSearchCV(estimator=pipeline_dt, param_grid=param_grid_dt, **search_kwargs)
grid_search_rf = GridSearchCV(estimator=pipeline_rf, param_grid=param_grid_rf, **search_kwargs)
grid_search_kn = GridSearchCV(estimator=pipeline_kn, param_grid=param_grid_kn, **search_kwargs)
grid_search_xgb = GridSearchCV(estimator=pipeline_xgb, param_grid=param_grid_xgb, **search_kwargs)
grid_searches = [grid_search_dt, grid_search_lr, grid_search_rf, grid_search_xgb, grid_search_kn]
grid_dict = {0: "DecisionTree", 1: "LinearRegression", 2: "RandomForest", 3: "XGBRegressor", 4: "KNeighbors"}
for idx, gs in enumerate(grid_searches):
    gs.fit(X_train, y_train)
    print(f"{grid_dict[idx]} best params: {gs.best_params_}")
    print(f"{grid_dict[idx]} best score: {-gs.best_score_}")
DecisionTree best params: {'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 10}
DecisionTree best score: 186245.34053360097
LinearRegression best params: {'lr__fit_intercept': True}
LinearRegression best score: 225881.30692869253
RandomForest best params: {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
RandomForest best score: 150010.8004436432
XGBRegressor best params: {'xgb__learning_rate': 0.2, 'xgb__max_depth': 3, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
XGBRegressor best score: 99119.28015703599
KNeighbors best params: {'kn__algorithm': 'auto', 'kn__n_neighbors': 3, 'kn__weights': 'distance'}
KNeighbors best score: 183510.17624733265
The best result was obtained for XGBRegressor with parameters: {'xgb__learning_rate': 0.2, 'xgb__max_depth': 3, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}.
# Refit XGBoost with the best grid-search parameters and evaluate on the
# held-out test split.
best_xgb_params = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
xgb_model = XGBRegressor(**best_xgb_params)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# mean_squared_error(..., squared=False) is deprecated (removed in
# scikit-learn 1.6); take the square root of the MSE explicitly instead.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
MAE: 74198.77597161685 RMSE: 97654.37179169404
The high standard deviation of the shipment volume (87922.90) indicates a large variability in the data, so the MAE value may be caused by single large forecasting errors that have a significant impact on the total result (also indicated by the RMSE being larger than the MAE). The comparison of the actual shipment volume with the forecasted values can be better illustrated with a graph:
# Interactive overlay of actual vs predicted volumes over the test samples.
sample_idx = list(range(len(y_test)))
fig = go.Figure()
fig.add_trace(go.Scatter(x=sample_idx, y=y_test, mode='lines', name='Actual Volume', line=dict(color='black')))
fig.add_trace(go.Scatter(x=sample_idx, y=y_pred, mode='lines', name='Predicted Volume', line=dict(dash='dash', color='gold')))
grid_style = dict(showgrid=True, gridcolor='lightgray')
fig.update_layout(
    title='Actual vs Predicted Volume',
    xaxis_title='Sample',
    yaxis_title='Volume',
    width=2000,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(grid_style),
    yaxis=dict(grid_style),
)
fig.show()
The graph shows that the predicted values are close to the actual values, which indicates good model performance. Single large errors are visible, especially when there is a large decrease or increase in the number of shipments.
# Predicted-vs-actual scatter; the dashed diagonal marks perfect prediction.
plt.figure(figsize=(14, 8))
plt.scatter(y_test, y_pred, color='gold')
plt.xlabel('Actual Volume')
plt.ylabel('Predicted Volume')
plt.title('Actual vs Predicted Volume Scatter Plot')
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, color='black', linestyle='--', lw=2)
plt.show()
Points near the diagonal line suggest good agreement between actual and forecast values. There are some outliers, especially at the beginning and end of the value range.